import pandas as pd # Data manipulation.
import plotly.express as px # Interactive charts.
import warnings # Silence warning messages.
warnings.filterwarnings("ignore")

# Load the cleaned insurance dataset and drop the leftover CSV index column.
df = pd.read_csv("insurence_clear")
df = df.drop(columns=["Unnamed: 0"])
# Binary-encode the two categorical flags: "male" -> 1 else 0, "yes" -> 1 else 0.
df["sex"] = df["sex"].apply(lambda v: int(v == "male"))
df["smoker"] = df["smoker"].apply(lambda v: int(v == "yes"))
Con el objetivo de que las variables sean comparables entre sí y tengan una escala en común.
from sklearn.preprocessing import MinMaxScaler
# Single scaler instance, re-fitted for each column it transforms below.
rescale=MinMaxScaler()
def rescale_many(cols, frame=None):
    """Min-max scale each named column to the [0, 1] range, in place.

    Parameters
    ----------
    cols : iterable of str
        Names of the columns to rescale.
    frame : pandas.DataFrame, optional
        DataFrame to operate on; defaults to the module-level ``df``.

    Returns
    -------
    pandas.DataFrame
        The same DataFrame with the listed columns rescaled.
    """
    frame = df if frame is None else frame
    for col in cols:
        # fit_transform expects 2-D input, hence the double brackets.
        frame[col] = rescale.fit_transform(frame[[col]])
    return frame
# Rescale the numeric feature columns to [0, 1].
df=rescale_many(['age', 'bmi', 'children'])
df.head()
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 0.000000 | 1 | 0.543793 | 0.2 | 0 | southeast | 1725.5523 |
| 1 | 0.217391 | 1 | 0.517241 | 0.6 | 0 | southeast | 4449.4620 |
| 2 | 0.152174 | 1 | 0.283448 | 0.0 | 0 | northeast | 2721.3208 |
| 3 | 0.108696 | 1 | 0.565517 | 0.0 | 0 | southwest | 1826.8430 |
| 4 | 0.021739 | 1 | 0.227586 | 0.2 | 0 | southwest | 1837.2370 |
# One-hot encode the remaining categorical column ("region" at this point).
df=pd.get_dummies(df)
df.head()
| age | sex | bmi | children | smoker | charges | region_northeast | region_northwest | region_southeast | region_southwest | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.000000 | 1 | 0.543793 | 0.2 | 0 | 1725.5523 | 0 | 0 | 1 | 0 |
| 1 | 0.217391 | 1 | 0.517241 | 0.6 | 0 | 4449.4620 | 0 | 0 | 1 | 0 |
| 2 | 0.152174 | 1 | 0.283448 | 0.0 | 0 | 2721.3208 | 1 | 0 | 0 | 0 |
| 3 | 0.108696 | 1 | 0.565517 | 0.0 | 0 | 1826.8430 | 0 | 0 | 0 | 1 |
| 4 | 0.021739 | 1 | 0.227586 | 0.2 | 0 | 1837.2370 | 0 | 0 | 0 | 1 |
def load_inputs_outputs(frame=None):
    """Split a DataFrame into a feature matrix and the regression target.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Source data; defaults to the module-level ``df``. Must contain a
        ``charges`` column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` — every column except ``charges`` as features, and the
        ``charges`` column as the target.
    """
    frame = df if frame is None else frame
    X = frame.drop(["charges"], axis="columns").values
    y = frame["charges"].values
    return X, y
# Feature matrix X (everything except "charges") and target vector y.
X,y=load_inputs_outputs()
from sklearn.model_selection import train_test_split,cross_val_score
# Hold out part of the data for evaluation; fixed seed for reproducibility.
X_train,X_test,Y_train,Y_test=train_test_split(X,y,
test_size=0.33, # 33% of the rows are held out for validation.
random_state=42)
X_train.shape[0],X_test.shape[0]
(896, 442)
from sklearn.ensemble import GradientBoostingRegressor
Importamos el algoritmo con mejores resultados, con base en los experimentos anteriores.
# Gradient-boosting regressor with the hyperparameters selected earlier.
gbr=GradientBoostingRegressor(max_depth=2,
n_estimators=616, # Number of boosting stages.
learning_rate=0.01, # Small step size, paired with many estimators.
random_state=42) # Reproducible runs.
Asignamos los mismos parámetros que habíamos establecido previamente.
# Fit the model on the training split.
gbr.fit(X_train,Y_train)
GradientBoostingRegressor(learning_rate=0.01, max_depth=2, n_estimators=616,
random_state=42)
# NOTE(review): cross-validation is run on the TEST split and the resulting
# mean score `cv` is never used afterwards — consider cross-validating on the
# training split instead, or dropping this line.
cv=cross_val_score(gbr,X_test,Y_test,cv=10).mean()
Mide el promedio del error entre el valor original y el predicho.
from sklearn.metrics import mean_squared_error
def mse(x, y_true, model=None):
    """Mean squared error of a model's predictions on *x*.

    Parameters
    ----------
    x : array-like
        Feature matrix to predict on.
    y_true : array-like
        Ground-truth target values.
    model : estimator, optional
        Fitted model exposing ``predict``; defaults to the module-level
        ``gbr``.

    Returns
    -------
    float
        Mean squared error between ``y_true`` and the predictions.
    """
    model = gbr if model is None else model
    y_pred = model.predict(x)
    return mean_squared_error(y_true=y_true, y_pred=y_pred)
# Collect train/test error and R^2 metrics into a one-row summary table.
mse_train=mse(X_train,Y_train)
mse_test=mse(X_test,Y_test)
score_train=gbr.score(X_train,Y_train) # R^2 on the training split.
score_test=gbr.score(X_test,Y_test) # R^2 on the held-out split.
# Wrap each scalar in a single-element list. The original called .flatten()
# on the metric values, which only works when they happen to be numpy
# scalars and raises AttributeError for plain Python floats.
df_evaluate=pd.DataFrame({"mse train":[mse_train],
"mse test":[mse_test],
"score train":[score_train],
"score test":[score_test]})
df_evaluate
| mse train | mse test | score train | score test | |
|---|---|---|---|---|
| 0 | 2.009736e+06 | 2.559540e+06 | 0.98511 | 0.981559 |
Tiene las métricas idénticas a las del modelo que desarrollé anteriormente en los experimentos.
# Compare predictions against the ground truth on the test split.
y_pred=gbr.predict(X_test).flatten()
df_test=pd.DataFrame({"y_true":Y_test,"y_pred":y_pred})
# Scatter plot with an OLS trendline; points near the diagonal indicate
# accurate predictions.
px.scatter(df_test,x="y_true",
y="y_pred",trendline="ols",
title="True values vs Predicted values")
# Show a fixed random sample of 30 predictions for inspection.
df_test.sample(n=30,random_state=30)
| y_true | y_pred | |
|---|---|---|
| 148 | 16138.762050 | 18451.967189 |
| 408 | 23568.272000 | 25433.300915 |
| 214 | 17904.527050 | 18937.038673 |
| 67 | 2464.618800 | 2642.999860 |
| 241 | 13981.850350 | 13479.627212 |
| 205 | 3172.018000 | 3793.734197 |
| 126 | 5910.944000 | 6495.118995 |
| 193 | 1705.624500 | 1857.264128 |
| 338 | 5321.133214 | 5342.585218 |
| 317 | 42303.692150 | 41869.632511 |
| 344 | 6435.623700 | 7127.608603 |
| 31 | 15555.188750 | 14197.766059 |
| 164 | 17560.379750 | 19428.403894 |
| 212 | 4340.440900 | 4488.017626 |
| 213 | 21880.820000 | 24056.856382 |
| 201 | 5152.134000 | 4994.226960 |
| 342 | 19444.265800 | 20876.004786 |
| 236 | 1534.304500 | 2364.243335 |
| 211 | 4536.259000 | 5525.743253 |
| 384 | 2257.505089 | 2705.120348 |
| 304 | 8891.139500 | 8445.980527 |
| 230 | 7222.786250 | 8254.031325 |
| 419 | 6686.431300 | 6583.084791 |
| 370 | 4243.590050 | 4615.464951 |
| 280 | 36149.483500 | 36305.325876 |
| 22 | 4719.524050 | 4305.292082 |
| 281 | 2585.850650 | 2011.512630 |
| 296 | 5438.749100 | 6669.416189 |
| 100 | 2723.957387 | 2618.762398 |
| 221 | 3577.999000 | 3352.136274 |
En la mayoría de los datos da predicciones muy cercanas al valor original, por lo cual dicho algoritmo es apto para resolver este problema.
import joblib # joblib lets us persist the trained model so it need not be retrained.
joblib.dump(gbr,"gbr_insurence.pkl") # Save the fitted model to disk in .pkl format.
['gbr_insurence.pkl']